Load required packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Load and inspect MVP voting data
# Read the MVP voting results from disk into a tibble.
mvp_voting <- read_csv(file = "Data/mvp_voting.csv")
## Rows: 719 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Rank, Player, Tm
## dbl (18): Age, First, Pts Won, Pts Max, Share, G, MP, PTS, TRB, AST, STL, BL...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the loaded voting table for inspection.
mvp_voting
Select only columns pertaining to MVP voting results
# Keep the player/season identifiers plus the three voting-result columns.
mvp_voting <- select(
  mvp_voting,
  all_of(c("Player", "Year", "Pts Won", "Pts Max", "Share"))
)
mvp_voting
Load and inspect player stats
# Read the per-player season statistics from disk into a tibble.
player_stats <- read_csv(file = "Data/player_stats.csv")
## Rows: 23881 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Player, Pos, Tm
## dbl (28): Rk, Age, G, GS, MP, FG, FGA, FG%, 3P, 3PA, 3P%, 2P, 2PA, 2P%, eFG%...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the loaded player statistics for inspection.
player_stats
Remove Rank column
# Drop the Rk column and keep every other column.
player_stats <- select(player_stats, !Rk)
player_stats
Remove asterisks after names
# Strip every literal "*" from the player names.
player_stats <- player_stats %>%
  mutate(Player = str_replace_all(Player, fixed("*"), ""))
player_stats
Convert NA values for percentages to zeros. This also sets games started (GS) to zero for seasons that predate the tracking of that metric. I will not use this column in my models, so this should have no impact.
# Replace missing values with zero in the numeric columns only. The character
# columns (Player, Pos, Tm) are left untouched: replace_na() casts the
# replacement to the column's type, so a numeric 0 cannot be applied to a
# character column.
player_stats <- player_stats %>%
  mutate(across(where(is.numeric), ~ replace_na(.x, 0)))
Group the data frame by player and year. Then handle players who played for multiple teams in one season by keeping only their combined "TOT" row, relabelled with the last team listed for that season.
#' Collapse one player-season to its combined "TOT" row
#'
#' When a group contains a combined row (`Tm == "TOT"`), only that row is
#' kept and its `Tm` is replaced by the last non-"TOT" team listed in the
#' group. Groups with at most one row, or without a "TOT" row, are returned
#' unchanged.
#'
#' @param df A data frame with a `Tm` column holding the rows of one group.
#' @return A data frame with the same columns as `df`.
handle_multiple_teams <- function(df) {
  # Nothing to consolidate for an empty or single-row group.
  if (nrow(df) <= 1) {
    return(df)
  }

  # NA-safe match so that a missing team never counts as the combined row.
  is_total <- !is.na(df$Tm) & df$Tm == "TOT"
  if (!any(is_total)) {
    return(df)
  }

  combined <- df[is_total, , drop = FALSE]

  # Use the last real team rather than blindly taking the last row, so the
  # result does not depend on where the "TOT" row sits within the group.
  teams <- df$Tm[!is_total & !is.na(df$Tm)]
  if (length(teams) > 0) {
    combined$Tm <- as.character(teams[length(teams)])
  }

  combined
}
# Coerce the team column to character, then run handle_multiple_teams() on
# each (Player, Year) group and drop the grouping afterwards.
player_stats <- player_stats %>%
  mutate(Tm = as.character(Tm)) %>%
  group_by(Player, Year) %>%
  group_modify(function(season_rows, key) handle_multiple_teams(season_rows)) %>%
  ungroup()
player_stats
Merge MVP voting with player stats
# Join the voting results onto the season stats, keeping unmatched rows from
# both tables, then fill missing voting figures with zero.
player_stats_with_mvp_voting <- player_stats %>%
  full_join(mvp_voting, by = c("Player", "Year")) %>%
  replace_na(list(`Pts Won` = 0, `Pts Max` = 0, Share = 0))
player_stats_with_mvp_voting
Load and inspect team stats
# Read the team-level season statistics from disk into a tibble.
team_stats <- read_csv("Data/team_stats.csv")
## Rows: 1254 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): GB, Team
## dbl (7): W, L, W/L%, PS/G, PA/G, SRS, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the loaded team statistics for inspection.
team_stats
Remove asterisks and seeds from team names
# Clean the team names in three steps: drop literal asterisks, drop any
# parenthesised text, then collapse the leftover whitespace.
team_stats <- team_stats %>%
  mutate(
    Team = Team %>%
      str_replace_all(fixed("*"), "") %>%
      str_replace_all("\\([^\\)]+\\)", "") %>%
      str_squish()
  )
team_stats
Change dashes for games back to zeros
# Swap every em dash in the games-back column for the character "0".
team_stats$GB <- str_replace_all(team_stats$GB, "—", "0")
team_stats
Convert games back from characters to numeric
# Turn the games-back column from character into numeric.
team_stats$GB <- as.numeric(team_stats$GB)
team_stats
Load mapping from full name to abbreviation
# Build a named list that maps the first field of each record (the
# abbreviation) to the second field (the full name). The first line of the
# file is treated as a header and skipped.
lines <- read_lines("Data/abbreviations.csv")
records <- lines[-1]
# Ignore blank lines so they do not produce NA keys.
records <- records[nzchar(trimws(records))]
fields <- strsplit(records, ",", fixed = TRUE)
# Trim stray whitespace so padded fields still match during recoding/joins.
keys <- trimws(vapply(fields, function(parts) parts[1], character(1)))
full_names <- trimws(vapply(fields, function(parts) parts[2], character(1)))
# If an abbreviation is listed more than once, the last entry wins.
is_latest <- !duplicated(keys, fromLast = TRUE)
abbreviations <- as.list(setNames(full_names[is_latest], keys[is_latest]))
Add full names to player stats with MVP voting
# Add a Team column by recoding each Tm value through the abbreviation map.
player_stats_with_mvp_voting$Team <- recode(
  player_stats_with_mvp_voting$Tm,
  !!!abbreviations
)
player_stats_with_mvp_voting
Merge player stats with MVP voting with team stats
# Join the player-level table to the team-level table on team name and year,
# keeping unmatched rows from both sides.
everything <- player_stats_with_mvp_voting %>%
  full_join(team_stats, by = c("Team", "Year"))
everything
Save combined stats to csv
# Write the combined table to disk as a CSV file.
write_csv(x = everything, file = "Data/combined_stats.csv")